library(tidyverse)
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
## Warning: package 'knitr' was built under R version 3.5.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Penalize scientific notation so large incomes print as plain numbers
options(scipen = 4)
First, I download the NLSY79 data from the course website and import the dataset.
# Import the NLSY79 extract; every column is parsed as double (see spec output below)
nlsy <- read_csv("nlsy79_income.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
# The raw file uses NLSY question reference numbers as column names; replace
# them with readable names.  Order must match the column order of the CSV
# exactly -- any slip here silently mislabels every downstream variable.
# Lower-case names (e.g. expect_education, income) are the ones actually used
# in this analysis; the upper-case survey codes are kept for traceability.
colnames(nlsy) <- c("versionid",
"caseid",
"birth_country",
"FAM-POB_1979",
"FAM-3_1979",
"FAM-3A_1979",
"FAM-RES_1979",
"FAM-6_1979",
"R_REL-1_COL_1979",
"expect_education",
"army",
"WOMENS-ROLES_000001_1979",
"WOMENS-ROLES_000002_1979",
"WOMENS-ROLES_000003_1979",
"WOMENS-ROLES_000004_1979",
"WOMENS-ROLES_000006_1979",
"WOMENS-ROLES_000007_1979",
"WOMENS-ROLES_000008_1979",
"EXP-OCC_1979",
"EXP-9_1979",
"race",
"gender",
"MARSTAT-KEY_1979",
"FAMSIZE_1979",
"povstatus_1979",
"police_1980",
"POLIC-1C_1980",
"POLICE-2_1980",
"ALCH-2_1983",
"num_drug_1984",
"DS-9_1984",
"Q13-5_TRUNC_REVISED_1990",
"POVSTATUS_1990",
"HGCREV90_1990",
"jobs.num",
"NUMCH90_1990",
"AGEYCH90_1990",
"DS-12_1998",
"DS-13_1998",
"INDALL-EMP.01_2000",
"CPSOCC80.01_2000",
"OCCSP-55I_CODE_2000",
"Q2-15B_2000",
"Q10-2_2000",
"Q13-5_TRUNC_REVISED_2000",
"FAMSIZE_2000",
"TNFI_TRUNC_2000",
"POVSTATUS_2000",
"marriage_col_2000",
"MARSTAT-KEY_2000",
"MO1M1B_XRND",
"Q2-10B~Y_2012",
"jobtype_2012",
"OCCALL-EMP.01_2012",
"OCCSP-55I_CODE_2012",
"Q2-15A_2012",
"Q12-6_2012",
"income",
"Q13-5_SR000001_2012",
"Q13-5_SR000002_2012",
"Q13-18_TRUNC_2012",
"Q13-18_SR000001_TRUNC_2012",
"familysize_2012",
"REGION_2012",
"HGC_2012",
"URBAN-RURAL_2012",
"jobsnum_2012")
# Quick first look: mean 2012 income by gender and race on the raw,
# un-recoded data (gender/race are still numeric codes here).
table.mean.income.0 <- nlsy %>%
  group_by(gender, race) %>%
  summarize(mean.income.0 = round(mean(income)))
table.mean.income.0
## # A tibble: 6 x 3
## # Groups: gender [2]
## gender race mean.income.0
## <dbl> <dbl> <dbl>
## 1 1 1 30047
## 2 1 2 21417
## 3 1 3 30998
## 4 2 1 19661
## 5 2 2 17133
## 6 2 3 16232
# Some graphic summaries
# FIX: gender is numeric (1/2) in the raw data, so ggplot treated x as a
# continuous aesthetic and warned "did you forget aes(group=...)?".
# Converting to a factor draws one box per gender, as intended.
ggplot(data=nlsy, aes(x=factor(gender), y=income, colour = expect_education)) + geom_boxplot()
Discussion: We can see from the rough table and graphic summary dividing income level by gender and race that there are some problems. First, we have not recoded the factor variables from the raw dataset, so we do not know what each number refers to. Second, there are many negative values in the raw dataset, which distort the accuracy of our analysis. Third, we want to explore differences in income by gender, but the raw data hardly shows the result that we want.
# Keep only the 11 variables needed for this analysis.
subset.nlsy <- nlsy %>%
  select("versionid", "caseid", "birth_country", "expect_education",
         "race", "gender", "income", "num_drug_1984",
         "marriage_col_2000", "familysize_2012", "jobsnum_2012")
# Negative values are NLSY missing/invalid codes: recode them to NA,
# then drop every row containing an NA to get a complete-case dataset.
subset.nlsy[subset.nlsy < 0] <- NA
new.nlsy <- na.omit(subset.nlsy)
new.nlsy
## # A tibble: 6,342 x 11
## versionid caseid birth_country expect_education race gender income
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 445 2 2 12 3 2 19000
## 2 445 3 1 14 3 2 35000
## 3 445 6 1 18 3 1 105000
## 4 445 8 1 14 3 2 40000
## 5 445 9 1 13 3 1 75000
## 6 445 14 1 17 3 2 102000
## 7 445 15 1 18 3 1 0
## 8 445 16 1 13 3 2 70000
## 9 445 17 1 14 3 1 60000
## 10 445 18 1 18 3 1 150000
## # ... with 6,332 more rows, and 4 more variables: num_drug_1984 <dbl>,
## # marriage_col_2000 <dbl>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
Discussion: I select 11 variables that may be useful for my analysis to create a new data frame called new.nlsy, and I remove all negative values by recoding them as NA and dropping those rows.
# Convert the coded survey answers into labelled factors.  With
# recode_factor() the first mapping listed becomes the first (baseline)
# level, so the orderings below are deliberate.
new.nlsy <- new.nlsy %>%
  mutate(race = recode_factor(race,
                              `3` = "other",
                              `2` = "black",
                              `1` = "hispanic"),
         gender = recode_factor(gender,
                                `1` = "male",
                                `2` = "female"),
         num_drug_1984 = recode_factor(num_drug_1984,
                                       `0` = "never",
                                       `1` = "1-9",
                                       `2` = "10-39",
                                       `3` = "40-99",
                                       `4` = "100-999",
                                       `5` = "1000+"),
         # never-married and other are collapsed into "other"
         marriage_col_2000 = recode_factor(marriage_col_2000,
                                           `2` = "married",
                                           `1` = "other",
                                           `3` = "other"),
         birth_country = recode_factor(birth_country,
                                       `1` = "US",
                                       `2` = "Other"),
         # Collapse the 1-18 grade codes into three broad schooling levels
         expect_education = recode_factor(expect_education,
                                          `13` = "college",
                                          `14` = "college",
                                          `15` = "college",
                                          `16` = "college",
                                          `17` = "college",
                                          `18` = "college",
                                          `7` = "middle_high",
                                          `8` = "middle_high",
                                          `9` = "middle_high",
                                          `10` = "middle_high",
                                          `11` = "middle_high",
                                          `12` = "middle_high",
                                          `1` = "primary",
                                          `2` = "primary",
                                          `3` = "primary",
                                          `4` = "primary",
                                          `5` = "primary",
                                          `6` = "primary"))
new.nlsy
## # A tibble: 6,342 x 11
## versionid caseid birth_country expect_education race gender income
## <dbl> <dbl> <fct> <fct> <fct> <fct> <dbl>
## 1 445 2 Other middle_high other female 19000
## 2 445 3 US college other female 35000
## 3 445 6 US college other male 105000
## 4 445 8 US college other female 40000
## 5 445 9 US college other male 75000
## 6 445 14 US college other female 102000
## 7 445 15 US college other male 0
## 8 445 16 US college other female 70000
## 9 445 17 US college other male 60000
## 10 445 18 US college other male 150000
## # ... with 6,332 more rows, and 4 more variables: num_drug_1984 <fct>,
## # marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
Discussion: From the dataset, we can see that most variables are categorical, and for the convenience of my analysis I recode some of them and choose baseline levels. For marital status, I combine never married and other into "other"; for expected education, I combine the 18 grade levels into primary school, middle/high school, and college for better analysis.
# Structure check: confirms which columns are now factors and which stay numeric
str(new.nlsy)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6342 obs. of 11 variables:
## $ versionid : num 445 445 445 445 445 445 445 445 445 445 ...
## $ caseid : num 2 3 6 8 9 14 15 16 17 18 ...
## $ birth_country : Factor w/ 2 levels "US","Other": 2 1 1 1 1 1 1 1 1 1 ...
## $ expect_education : Factor w/ 3 levels "college","middle_high",..: 2 1 1 1 1 1 1 1 1 1 ...
## $ race : Factor w/ 3 levels "other","black",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gender : Factor w/ 2 levels "male","female": 2 2 1 2 1 2 1 2 1 1 ...
## $ income : num 19000 35000 105000 40000 75000 102000 0 70000 60000 150000 ...
## $ num_drug_1984 : Factor w/ 6 levels "never","1-9",..: 1 5 5 2 5 5 1 5 5 6 ...
## $ marriage_col_2000: Factor w/ 2 levels "married","other": 1 1 1 2 1 2 1 1 2 2 ...
## $ familysize_2012 : num 3 2 5 3 3 1 6 4 1 1 ...
## $ jobsnum_2012 : num 3 17 12 13 5 24 9 12 14 4 ...
# Five-number summaries for numeric columns, level counts for factors
summary(new.nlsy)
## versionid caseid birth_country expect_education
## Min. :445 Min. : 2 US :5952 college :3421
## 1st Qu.:445 1st Qu.: 2479 Other: 390 middle_high:2895
## Median :445 Median : 4970 primary : 26
## Mean :445 Mean : 5294
## 3rd Qu.:445 3rd Qu.: 7865
## Max. :445 Max. :12679
## race gender income num_drug_1984
## other :3221 male :3024 Min. : 0 never :2459
## black :1940 female:3318 1st Qu.: 1000 1-9 :1681
## hispanic:1181 Median : 30000 10-39 : 707
## Mean : 42102 40-99 : 487
## 3rd Qu.: 56000 100-999: 572
## Max. :343830 1000+ : 436
## marriage_col_2000 familysize_2012 jobsnum_2012
## married:3577 Min. : 1.000 Min. : 0.00
## other :2765 1st Qu.: 2.000 1st Qu.: 7.00
## Median : 2.000 Median :11.00
## Mean : 2.627 Mean :12.25
## 3rd Qu.: 3.000 3rd Qu.:16.00
## Max. :16.000 Max. :58.00
Discussion: From the str() function output, we can see that the selected variables have been recoded into factor variables.
# Make some tables to see the average income when broken down by gender and other factor variables
# Mean income by gender x race, shown as a race-by-gender markdown table.
table.mean.income.1 <- new.nlsy %>%
  group_by(gender, race) %>%
  summarize(mean.income.1 = round(mean(income), 0))
# pivot_wider() supersedes spread() (tidyr >= 1.0.0 is loaded above)
kable(pivot_wider(table.mean.income.1, names_from = gender, values_from = mean.income.1),
      format = "markdown")
| race | male | female |
|---|---|---|
| other | 70789 | 33937 |
| black | 34061 | 24108 |
| hispanic | 47884 | 29033 |
# Mean income by gender x expected education, as a markdown table.
table.mean.income.2 <- new.nlsy %>%
  group_by(gender, expect_education) %>%
  summarize(mean.income.2 = round(mean(income), 0))
# pivot_wider() supersedes spread() (tidyr >= 1.0.0 is loaded above)
kable(pivot_wider(table.mean.income.2, names_from = gender, values_from = mean.income.2),
      format = "markdown")
| expect_education | male | female |
|---|---|---|
| college | 73759 | 37968 |
| middle_high | 35401 | 20207 |
| primary | 23000 | 6102 |
# Mean income by gender x marital status x family size
table.mean.income.3 <- new.nlsy %>%
  group_by(gender, marriage_col_2000, familysize_2012) %>%
  summarize(mean.income.3 = round(mean(income)))
table.mean.income.3
## # A tibble: 42 x 4
## # Groups: gender, marriage_col_2000 [4]
## gender marriage_col_2000 familysize_2012 mean.income.3
## <fct> <fct> <dbl> <dbl>
## 1 male married 1 49101
## 2 male married 2 60862
## 3 male married 3 72436
## 4 male married 4 86635
## 5 male married 5 88478
## 6 male married 6 73609
## 7 male married 7 32644
## 8 male married 8 31167
## 9 male married 11 114000
## 10 male other 1 32928
## # ... with 32 more rows
# Mean income by gender x birth country x 1984 drug-use frequency
table.mean.income.4 <- new.nlsy %>%
  group_by(gender, birth_country, num_drug_1984) %>%
  summarize(mean.income.4 = round(mean(income)))
table.mean.income.4
## # A tibble: 24 x 4
## # Groups: gender, birth_country [4]
## gender birth_country num_drug_1984 mean.income.4
## <fct> <fct> <fct> <dbl>
## 1 male US never 56453
## 2 male US 1-9 58145
## 3 male US 10-39 65306
## 4 male US 40-99 56023
## 5 male US 100-999 54894
## 6 male US 1000+ 33795
## 7 male Other never 46528
## 8 male Other 1-9 50854
## 9 male Other 10-39 92285
## 10 male Other 40-99 89535
## # ... with 14 more rows
Discussion: In order to investigate income differences by gender, I create several tables exploring the relationship between average income and gender, race, expected education, and so on. For example, in table four, the average income of males born in the US who used drugs 10-39 times is even higher than that of those who never used drugs, contrary to what is commonly thought. Therefore, we should further discuss what factors influence the income differences by gender.
# Some graphics showing the relationship between gender and income
# qplot scatter: income by gender, with race mapped to both colour and shape
qplot(x=gender, y=income, data=new.nlsy,
color = race,
shape = race,
xlab = "gender",
ylab = "income in 2012")
# Histogram of income; bars stacked by gender
ggplot(new.nlsy, aes(x = income)) + xlab("income in 2012") + geom_histogram(aes(fill = gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram filled by race, made interactive with plotly
w <- ggplot(new.nlsy, aes(x = income)) + xlab("income in 2012") + geom_histogram(aes(fill = race))
ggplotly(w)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplots of income by education level and by race, split by gender
# NOTE(review): the original comment mentioned a violin plot, but only
# boxplots are drawn here.
ggplot(data=new.nlsy, aes(x=expect_education, y=income, colour = gender)) + geom_boxplot()
s <- ggplot(data=new.nlsy, aes(x=race, y=income, colour = gender)) + geom_boxplot()
ggplotly(s)
# Boxplot of income by family size (family size coerced to a factor so
# each size gets its own box)
plot.1 <- ggplot(new.nlsy, aes(x = as.factor(familysize_2012), y = income)) +
xlab("familysize in 2012") +
ylab("income in 2012")
plot.1 + geom_boxplot()
# Visualize table.mean.income.1: average income by race, dodged by gender
plot.2 <- ggplot(data = table.mean.income.1,
                 aes(y = mean.income.1, x = race, fill = gender))
# colour-blind-safe fills for the two genders
color.2 <- c("#D55E00", "#0072B2")
# geom_col() is shorthand for geom_bar(stat = "identity")
plot.2 + geom_col(position = "dodge") +
  ylab("average income in 2012") +
  xlab("race") +
  guides(fill = guide_legend(title = "people's average income in 2012")) +
  scale_fill_manual(values = color.2)
# Mean income by gender x family size (the original comment said
# expect_education, but family size is what is grouped on here)
table.mean.income.5 <- new.nlsy %>%
  group_by(gender, familysize_2012) %>%
  summarize(mean.income.5 = round(mean(income)))
table.mean.income.5
## # A tibble: 24 x 3
## # Groups: gender [2]
## gender familysize_2012 mean.income.5
## <fct> <dbl> <dbl>
## 1 male 1 36773
## 2 male 2 51510
## 3 male 3 59732
## 4 male 4 78366
## 5 male 5 78242
## 6 male 6 71082
## 7 male 7 25520
## 8 male 8 23375
## 9 male 9 0
## 10 male 10 0
## # ... with 14 more rows
# Bar chart of table.mean.income.5: average income by family size, dodged by gender
plot.3 <- ggplot(data = table.mean.income.5,
                 aes(y = mean.income.5, x = familysize_2012, fill = gender))
color.3 <- c("#009E73", "#999999")
# geom_col() is shorthand for geom_bar(stat = "identity")
plot.3 + geom_col(position = "dodge") +
  ylab("average income in 2012") +
  xlab("family size in 2012") +
  guides(fill = guide_legend(title = "people's average income in 2012")) +
  scale_fill_manual(values = color.3)
# Visualize the association between income and familysize_2012 by gender,
# with a separate linear fit per gender.
# FIX: the x-axis label and title previously referred to drug use in 1984 --
# clearly copied from another plot; this chart shows family size vs income.
ggplot(new.nlsy,
       aes(x=familysize_2012, y=income, shape=gender, color=gender)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ylab("income in 2012") +
  xlab("family size in 2012") +
  ggtitle("income in 2012 by family size")
# Density plot of income by gender.
# FIX: the previous xlim = c(-4, 6) was on the wrong scale for income in
# dollars and silently dropped 4825 of 6342 rows ("Removed 4825 rows"
# warning); removing it shows the full income distribution.
qplot(fill = gender, x = income, data = new.nlsy, geom = "density",
alpha = I(0.5),
adjust = 1.5)
Discussion: From the various ggplots, we can roughly see that income in 2012 is not normally distributed for either males or females. We can see that over 1500 respondents' incomes equal zero. Top-coded income (outliers) somewhat affects the distribution of income broken down by gender. People in larger families tend to earn higher incomes, and higher levels of expected education are associated with higher income.
# Overall Pearson correlation between family size and income
cor(new.nlsy$familysize_2012,new.nlsy$income)
## [1] 0.08361683
# Does the correlation between income and family size vary by gender?
new.nlsy %>%
group_by(gender) %>%
summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 2 x 2
## gender cor_income_family
## <fct> <dbl>
## 1 male 0.179
## 2 female -0.0344
# Does the income/family-size correlation vary by expected education and
# 1984 drug use?  (Some cells are tiny -- e.g. primary/1-9 -- so treat
# extreme values like -1 as small-sample artifacts.)
new.nlsy %>%
group_by(expect_education, num_drug_1984) %>%
summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 14 x 3
## # Groups: expect_education [3]
## expect_education num_drug_1984 cor_income_family
## <fct> <fct> <dbl>
## 1 college never 0.0685
## 2 college 1-9 0.139
## 3 college 10-39 0.0122
## 4 college 40-99 0.208
## 5 college 100-999 0.169
## 6 college 1000+ -0.0310
## 7 middle_high never -0.00866
## 8 middle_high 1-9 0.0647
## 9 middle_high 10-39 0.149
## 10 middle_high 40-99 0.102
## 11 middle_high 100-999 0.00399
## 12 middle_high 1000+ 0.103
## 13 primary never -0.359
## 14 primary 1-9 -1
Discussion: I decide to run some correlations between income, family size, and other factor variables. Broken down by gender, the correlation between income and family size among females is -0.0344, which is surprising. Also, when broken down by expected education and number of times using drugs in 1984, the college/1000+ group and the middle-high/never group show negative correlations too (note that some of these cells contain very few observations).
# Testing differences in income between male and female: start with a boxplot
qplot(x = gender, y = income,
geom = "boxplot", data = new.nlsy,
xlab = "gender",
ylab = "income in 2012",
fill = I("lightblue"))
Discussion: First I use a boxplot to see the distribution of income broken down by gender. Generally, males' income is higher than females'. The median of males' income is around 50000 and the median of females' income is around 25000.
# Per-gender n, mean, SD and standard error of income (se = sd / sqrt(n))
new.nlsy %>%
group_by(gender) %>%
summarize(num.obs = n(),
mean.income = round(mean(income), 0),
sd.income = round(sd(income), 0),
se.income = round(sd(income) / sqrt(num.obs), 0))
## # A tibble: 2 x 5
## gender num.obs mean.income sd.income se.income
## <fct> <int> <dbl> <dbl> <dbl>
## 1 male 3024 55384 70465 1281
## 2 female 3318 29996 35882 623
Discussion: We can see that the average income of males is much larger than that of females, but males' standard deviation is also much larger. Females' standard error of income is very small, which means their sample mean is estimated precisely.
# Welch two-sample t-test: does mean income differ by gender?
# (t.test() defaults to unequal variances, appropriate given the very
# different SDs above.)
income.t.test <- t.test(income ~ gender, data = new.nlsy)
income.t.test
##
## Welch Two Sample t-test
##
## data: income by gender
## t = 17.819, df = 4396.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 22594.80 28181.37
## sample estimates:
## mean in group male mean in group female
## 55384.15 29996.07
Discussion: We can see that the p-value is smaller than 2.2e-16, so at the 0.05 significance level the difference in mean income between males and females is statistically significant.
# Exact p-value extracted from the t-test object
income.t.test$p.value
## [1] 1.23916e-68
Discussion: The p-value of the t-test is extremely small.
# Group means (male vs female) from the t-test object
income.t.test$estimate
## mean in group male mean in group female
## 55384.15 29996.07
Discussion: There is a significant difference between the average incomes of males and females (55384 vs 29996). The estimated difference in means is about $25388, with males earning more on average.
# 95% confidence interval for the male-female difference in mean income
income.t.test$conf.int
## [1] 22594.80 28181.37
## attr(,"conf.level")
## [1] 0.95
Discussion: The confidence interval is [22594.80,28181.37]
# Wilcoxon rank-sum test: a nonparametric check, robust to the heavy
# right skew and top-coding visible in the income distribution
income.wilcox.test <- wilcox.test(income ~ gender, data=new.nlsy, conf.int=TRUE)
income.wilcox.test
##
## Wilcoxon rank sum test with continuity correction
##
## data: income by gender
## W = 6232500, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 12000 16000
## sample estimates:
## difference in location
## 14000
Discussion: When running a Wilcoxon test, the p-value is also smaller than 2.2e-16, so the difference is statistically significant.
# Explore whether income is normally distributed: Q-Q plots, one panel per gender
ggplot(data = new.nlsy, aes(sample = income)) + stat_qq() + stat_qq_line() + facet_grid(. ~ gender)
Discussion: First, we can see from the Q-Q plot that there are extremely high incomes. The points deviate substantially from the reference line, which indicates that our data is not normally distributed.
# One-way ANOVA: income by gender
summary(aov(income ~ gender, data = new.nlsy))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 1.020e+12 1019745466082 335.3 <2e-16 ***
## Residuals 6340 1.928e+13 3041129420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA with gender, race and expected education as factors
summary(aov(income ~ gender + race + expect_education, data = new.nlsy))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 1.020e+12 1019745466082 369.3 <2e-16 ***
## race 2 6.454e+11 322709466221 116.9 <2e-16 ***
## expect_education 2 1.138e+12 569025755417 206.1 <2e-16 ***
## Residuals 6336 1.750e+13 2761567247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA with gender, 1984 drug use and marital status as factors
summary(aov(income ~ gender + num_drug_1984 + marriage_col_2000, data = new.nlsy))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 1.020e+12 1019745466082 348.2 < 2e-16 ***
## num_drug_1984 5 1.729e+11 34575426088 11.8 2.19e-11 ***
## marriage_col_2000 1 5.566e+11 556603604980 190.0 < 2e-16 ***
## Residuals 6334 1.855e+13 2928841141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Discussion: We can see from the three outputs that all p-values are smaller than 2e-16, significant at the 0.05 level, so there is an association between income and gender, race, num_drug_1984, marriage_col_2000, and so on.
# Regress income on every other column in new.nlsy.
# NOTE(review): versionid is constant (445 for every row, see summary above),
# so lm() returns NA for its coefficient ("1 not defined because of
# singularities"); caseid is an identifier rather than a substantive
# predictor -- consider dropping both from the formula.
lm.1 <- lm(income ~ ., data = new.nlsy)
lm.1
##
## Call:
## lm(formula = income ~ ., data = new.nlsy)
##
## Coefficients:
## (Intercept) versionid
## 83904.5394 NA
## caseid birth_countryOther
## -0.3244 4245.4622
## expect_educationmiddle_high expect_educationprimary
## -25934.2485 -39654.4835
## raceblack racehispanic
## -17276.9106 -10195.5902
## genderfemale num_drug_19841-9
## -27623.3063 2201.8980
## num_drug_198410-39 num_drug_198440-99
## 7427.0388 2789.2523
## num_drug_1984100-999 num_drug_19841000+
## 629.6002 -8180.1443
## marriage_col_2000other familysize_2012
## -9017.5803 1773.3934
## jobsnum_2012
## -698.4010
# Full summary of lm.1: coefficient table, significance and fit statistics
summary(lm.1)
##
## Call:
## lm(formula = income ~ ., data = new.nlsy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98972 -28207 -7776 14733 320924
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83904.5394 2634.4909 31.848 < 2e-16 ***
## versionid NA NA NA NA
## caseid -0.3244 0.2326 -1.394 0.163245
## birth_countryOther 4245.4622 2930.5040 1.449 0.147467
## expect_educationmiddle_high -25934.2485 1327.7997 -19.532 < 2e-16 ***
## expect_educationprimary -39654.4835 10431.5005 -3.801 0.000145 ***
## raceblack -17276.9106 1751.4076 -9.865 < 2e-16 ***
## racehispanic -10195.5902 2080.5784 -4.900 9.80e-07 ***
## genderfemale -27623.3063 1335.6608 -20.681 < 2e-16 ***
## num_drug_19841-9 2201.8980 1659.1692 1.327 0.184521
## num_drug_198410-39 7427.0388 2241.5209 3.313 0.000927 ***
## num_drug_198440-99 2789.2523 2608.9259 1.069 0.285057
## num_drug_1984100-999 629.6002 2469.0916 0.255 0.798737
## num_drug_19841000+ -8180.1443 2799.7279 -2.922 0.003493 **
## marriage_col_2000other -9017.5803 1459.4575 -6.179 6.86e-10 ***
## familysize_2012 1773.3934 489.0990 3.626 0.000290 ***
## jobsnum_2012 -698.4010 96.0148 -7.274 3.92e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51830 on 6326 degrees of freedom
## Multiple R-squared: 0.1628, Adjusted R-squared: 0.1608
## F-statistic: 81.99 on 15 and 6326 DF, p-value: < 2.2e-16
# Markdown-formatted coefficient table for lm.1
kable(summary(lm.1)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 83904.539 | 2634.491 | 31.848 | 0.0000 |
| caseid | -0.324 | 0.233 | -1.394 | 0.1632 |
| birth_countryOther | 4245.462 | 2930.504 | 1.449 | 0.1475 |
| expect_educationmiddle_high | -25934.248 | 1327.800 | -19.532 | 0.0000 |
| expect_educationprimary | -39654.483 | 10431.500 | -3.801 | 0.0001 |
| raceblack | -17276.911 | 1751.408 | -9.865 | 0.0000 |
| racehispanic | -10195.590 | 2080.578 | -4.900 | 0.0000 |
| genderfemale | -27623.306 | 1335.661 | -20.681 | 0.0000 |
| num_drug_19841-9 | 2201.898 | 1659.169 | 1.327 | 0.1845 |
| num_drug_198410-39 | 7427.039 | 2241.521 | 3.313 | 0.0009 |
| num_drug_198440-99 | 2789.252 | 2608.926 | 1.069 | 0.2851 |
| num_drug_1984100-999 | 629.600 | 2469.092 | 0.255 | 0.7987 |
| num_drug_19841000+ | -8180.144 | 2799.728 | -2.922 | 0.0035 |
| marriage_col_2000other | -9017.580 | 1459.458 | -6.179 | 0.0000 |
| familysize_2012 | 1773.393 | 489.099 | 3.626 | 0.0003 |
| jobsnum_2012 | -698.401 | 96.015 | -7.274 | 0.0000 |
Discussion: Now we run a regression model, regressing income on gender and all the other variables. We can see that most p-values are very small, effectively zero (e.g. for race and expected education), which means these are significant predictors of income. Note that many coefficients (genderfemale, raceblack, the education contrasts) are negative, indicating lower expected income relative to the baseline group. The adjusted R-squared is 0.1608, which means that about 16% of the variance in income is explained by these predictors.
# Regress income on gender, race, expected education, marital status and family size
lm.2 <- lm(income ~ gender + race + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy)
lm.2
##
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 +
## familysize_2012, data = new.nlsy)
##
## Coefficients:
## (Intercept) genderfemale
## 74260 -26224
## raceblack racehispanic
## -17845 -10342
## expect_educationmiddle_high expect_educationprimary
## -25525 -34765
## marriage_col_2000other familysize_2012
## -10464 2018
# Full summary of lm.2
summary(lm.2)
##
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 +
## familysize_2012, data = new.nlsy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90406 -27793 -7760 14470 321386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74260.1 1943.4 38.211 < 2e-16 ***
## genderfemale -26224.1 1314.4 -19.952 < 2e-16 ***
## raceblack -17844.8 1566.5 -11.391 < 2e-16 ***
## racehispanic -10341.8 1802.8 -5.737 1.01e-08 ***
## expect_educationmiddle_high -25525.1 1328.3 -19.217 < 2e-16 ***
## expect_educationprimary -34765.3 10338.2 -3.363 0.000776 ***
## marriage_col_2000other -10464.4 1456.5 -7.184 7.52e-13 ***
## familysize_2012 2018.3 490.7 4.113 3.95e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52170 on 6334 degrees of freedom
## Multiple R-squared: 0.1509, Adjusted R-squared: 0.15
## F-statistic: 160.8 on 7 and 6334 DF, p-value: < 2.2e-16
# Markdown-formatted coefficient table for lm.2
kable(summary(lm.2)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 74260.134 | 1943.405 | 38.211 | 0.0000 |
| genderfemale | -26224.055 | 1314.356 | -19.952 | 0.0000 |
| raceblack | -17844.845 | 1566.510 | -11.391 | 0.0000 |
| racehispanic | -10341.816 | 1802.806 | -5.737 | 0.0000 |
| expect_educationmiddle_high | -25525.122 | 1328.265 | -19.217 | 0.0000 |
| expect_educationprimary | -34765.265 | 10338.199 | -3.363 | 0.0008 |
| marriage_col_2000other | -10464.380 | 1456.524 | -7.184 | 0.0000 |
| familysize_2012 | 2018.251 | 490.669 | 4.113 | 0.0000 |
Discussion: Now we run a regression model, regressing income on gender, race, expected education, marital status, and family size. All p-values are very small, so each predictor is significant. Most coefficients are negative relative to their baseline levels, while family size has a positive coefficient. The adjusted R-squared is 0.15, which means that about 15% of the variance in income is explained by these predictors.
# Diagnostic plots for lm.2: plot.lm() draws the four standard panels
# (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage)
plot(lm.2)
# Pairs plot of the predictors used in lm.2, as a rough collinearity check
income.var.names.1 <- c("gender", "race", "expect_education", "marriage_col_2000", "familysize_2012")
pairs(new.nlsy[,income.var.names.1])
Discussion: Now we examine the diagnostics for the model regressing income on gender, race, expected education, marital status, and family size. From the residuals-vs-fitted graph, we can see that although there are some outliers, the residuals show a fan shape, so the variance is not constant. From the Q-Q plot, the underlying normality assumption does not hold exactly, though I still use the linear model for prediction; the fit is not ideal but seems acceptable. The scale-location plot is another view of the residuals-vs-fitted pattern. In the residuals-vs-leverage plot, points with high residuals and high leverage (outliers) pull the fit away from the rest of the data.
From the pairs plot, we can see that in this model the collinearity between the predictors is very weak.
# Regress income on gender, race, expected education, 1984 drug use and family size
lm.3 <- lm(income ~ gender + race + expect_education + num_drug_1984 + familysize_2012, data = new.nlsy)
lm.3
##
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 +
## familysize_2012, data = new.nlsy)
##
## Coefficients:
## (Intercept) genderfemale
## 69714.4 -27574.2
## raceblack racehispanic
## -21018.3 -11937.9
## expect_educationmiddle_high expect_educationprimary
## -25983.6 -35323.4
## num_drug_19841-9 num_drug_198410-39
## 702.8 5630.0
## num_drug_198440-99 num_drug_1984100-999
## 297.7 -2948.0
## num_drug_19841000+ familysize_2012
## -13469.7 2979.1
# Full summary of lm.3
summary(lm.3)
##
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 +
## familysize_2012, data = new.nlsy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93548 -28093 -7486 14941 322528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69714.4 2095.8 33.264 < 2e-16 ***
## genderfemale -27574.2 1341.4 -20.557 < 2e-16 ***
## raceblack -21018.3 1511.7 -13.903 < 2e-16 ***
## racehispanic -11937.9 1799.3 -6.635 3.52e-11 ***
## expect_educationmiddle_high -25983.6 1327.6 -19.572 < 2e-16 ***
## expect_educationprimary -35323.4 10372.8 -3.405 0.000665 ***
## num_drug_19841-9 702.8 1660.7 0.423 0.672146
## num_drug_198410-39 5630.0 2244.6 2.508 0.012158 *
## num_drug_198440-99 297.7 2610.2 0.114 0.909184
## num_drug_1984100-999 -2948.0 2458.5 -1.199 0.230522
## num_drug_19841000+ -13469.7 2764.9 -4.872 1.13e-06 ***
## familysize_2012 2979.1 469.3 6.348 2.34e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52240 on 6330 degrees of freedom
## Multiple R-squared: 0.1491, Adjusted R-squared: 0.1476
## F-statistic: 100.8 on 11 and 6330 DF, p-value: < 2.2e-16
# Markdown-formatted coefficient table for lm.3
kable(summary(lm.3)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 69714.376 | 2095.816 | 33.264 | 0.0000 |
| genderfemale | -27574.157 | 1341.352 | -20.557 | 0.0000 |
| raceblack | -21018.264 | 1511.739 | -13.903 | 0.0000 |
| racehispanic | -11937.941 | 1799.327 | -6.635 | 0.0000 |
| expect_educationmiddle_high | -25983.624 | 1327.618 | -19.572 | 0.0000 |
| expect_educationprimary | -35323.360 | 10372.841 | -3.405 | 0.0007 |
| num_drug_19841-9 | 702.834 | 1660.665 | 0.423 | 0.6721 |
| num_drug_198410-39 | 5629.991 | 2244.599 | 2.508 | 0.0122 |
| num_drug_198440-99 | 297.748 | 2610.175 | 0.114 | 0.9092 |
| num_drug_1984100-999 | -2948.049 | 2458.490 | -1.199 | 0.2305 |
| num_drug_19841000+ | -13469.729 | 2764.865 | -4.872 | 0.0000 |
| familysize_2012 | 2979.142 | 469.338 | 6.348 | 0.0000 |
Discussion: Now we run a regression model, regressing income on gender, race, expect_education, num_drug_1984, and family size. We can see that most p-values are very small, effectively zero (race and expected education), confirming these as significant predictors. However, the p-values for several drug-use categories are high, which suggests drug use may not be a significant factor. Most significant coefficients are negative relative to their baselines, while family size is positive. The adjusted R-squared is 0.1476, which means that about 15% of the variance in income is explained by these predictors.
# regress outcome variable income against gender, race, expect_education, jobsnum, marriage and familysize
lm.4 <- lm(income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy)
lm.4
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012, data = new.nlsy)
##
## Coefficients:
## (Intercept) genderfemale
## 84405.5 -27093.3
## raceblack racehispanic
## -18556.2 -10712.6
## jobsnum_2012 expect_educationmiddle_high
## -723.9 -26361.8
## expect_educationprimary marriage_col_2000other
## -38791.6 -9352.6
## familysize_2012
## 1782.4
summary(lm.4)
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012, data = new.nlsy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96493 -28371 -7669 14704 322047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 84405.46 2340.59 36.062 < 2e-16 ***
## genderfemale -27093.30 1313.21 -20.631 < 2e-16 ***
## raceblack -18556.15 1562.08 -11.879 < 2e-16 ***
## racehispanic -10712.56 1795.21 -5.967 2.54e-09 ***
## jobsnum_2012 -723.92 94.01 -7.700 1.57e-14 ***
## expect_educationmiddle_high -26361.75 1326.65 -19.871 < 2e-16 ***
## expect_educationprimary -38791.59 10304.23 -3.765 0.000168 ***
## marriage_col_2000other -9352.59 1457.04 -6.419 1.47e-10 ***
## familysize_2012 1782.42 489.39 3.642 0.000273 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51930 on 6333 degrees of freedom
## Multiple R-squared: 0.1588, Adjusted R-squared: 0.1577
## F-statistic: 149.4 on 8 and 6333 DF, p-value: < 2.2e-16
# NOTE(review): the model fit immediately above is lm.4, but the original call
# tabulated lm.3 again (copy-paste error). The stale rendered table below will
# refresh to the lm.4 coefficients on the next knit.
kable(summary(lm.4)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 69714.376 | 2095.816 | 33.264 | 0.0000 |
| genderfemale | -27574.157 | 1341.352 | -20.557 | 0.0000 |
| raceblack | -21018.264 | 1511.739 | -13.903 | 0.0000 |
| racehispanic | -11937.941 | 1799.327 | -6.635 | 0.0000 |
| expect_educationmiddle_high | -25983.624 | 1327.618 | -19.572 | 0.0000 |
| expect_educationprimary | -35323.360 | 10372.841 | -3.405 | 0.0007 |
| num_drug_19841-9 | 702.834 | 1660.665 | 0.423 | 0.6721 |
| num_drug_198410-39 | 5629.991 | 2244.599 | 2.508 | 0.0122 |
| num_drug_198440-99 | 297.748 | 2610.175 | 0.114 | 0.9092 |
| num_drug_1984100-999 | -2948.049 | 2458.490 | -1.199 | 0.2305 |
| num_drug_19841000+ | -13469.729 | 2764.865 | -4.872 | 0.0000 |
| familysize_2012 | 2979.142 | 469.338 | 6.348 | 0.0000 |
Discussion: Now we run a regression of income on gender, race, jobsnum_2012, expect_education, marriage_col_2000 and familysize_2012. All p-values are very small, so every predictor in this model is statistically significant at the 0.05 level. Most coefficients are negative (female gender, black/hispanic race, lower expected education, non-married status, and number of jobs held), so those characteristics are associated with lower income relative to the baseline; family size has a positive coefficient. The adjusted R-squared is 0.1577, meaning about 16% of the variance of income is explained by the model.
Interpret the coefficient of genderfemale: on average, holding the other variables constant, the income of females in lm.4 is about $27093 less than that of males (the corresponding estimate in lm.3 is about $27574).
# Race-specific regression lines: every race shares the lm.4 slope for
# jobsnum_2012, with the intercept shifted by that race's offset from the
# baseline ("other") level.
b <- coef(lm.4)
intercepts <- b["(Intercept)"] + c(0, b["raceblack"], b["racehispanic"])
lines.df <- data.frame(intercepts = intercepts,
                       slopes = rep(b["jobsnum_2012"], 3),
                       race = levels(new.nlsy$race))
qplot(x = jobsnum_2012, y = income, color = race, data = new.nlsy) +
  geom_abline(aes(intercept = intercepts,
                  slope = slopes,
                  color = race), data = lines.df)
Interpreting the coefficients for race: the baseline level is "other", so raceblack and racehispanic measure offsets from that baseline. Among people with the same number of jobs in 2012 (and the same values of the other predictors), hispanic respondents earn more on average than black respondents, and both groups earn less than the baseline group.
# Plot the linear regression model
plot(lm.4)
# Collinearity and pairs plots
income.var.names.3 <- c("gender", "race", "jobsnum_2012", "expect_education", "familysize_2012","marriage_col_2000")
pairs(new.nlsy[,income.var.names.3])
Discussion: Now we run a regression of income on gender, race, expected education, marriage status and family size. From the residuals-vs-fitted plot we can see that, although there are some outliers, the residuals show a somewhat fan-shaped (non-constant) variance. The QQ-plot shows that the normality assumption does not hold exactly, but the linear model still seems a reasonable approximation. The scale-location plot is another view of the residuals-vs-fitted relationship. The residuals-vs-leverage plot shows that a few points with high residuals and high leverage (outliers) pull the fit away from the rest of the data.
From the pairs plot, although the collinearity between the factor variables is still weak, it is somewhat stronger than in the previous model; there is some collinearity among jobsnum_2012, expected education and family size.
#Whether race, marriage,familysize is a significant predictor of income differences between male and female
anova(update(lm.4, . ~ . - race), lm.4)
## Analysis of Variance Table
##
## Model 1: income ~ gender + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6335 17470336498276
## 2 6333 17077456571730 2 392879926546 72.848 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.4, . ~ . - marriage_col_2000), lm.4)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6334 17188562072246
## 2 6333 17077456571730 1 111105500517 41.202 1.472e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(update(lm.4, . ~ . - familysize_2012), lm.4)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6334 17113227527406
## 2 6333 17077456571730 1 35770955677 13.265 0.0002725 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Discussion: We can see from the three ANOVA outputs that all p-values are smaller than 0.05, so race, marriage status and family size each add statistically significant explanatory power to the model.
#interact gender with race
lm.4.interact <- update(lm.4, . ~ . + gender*race)
summary(lm.4.interact)
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012 + gender:race, data = new.nlsy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -100482 -27473 -8069 14292 329293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90030.91 2416.78 37.252 < 2e-16 ***
## genderfemale -37888.59 1821.40 -20.802 < 2e-16 ***
## raceblack -31553.64 2187.30 -14.426 < 2e-16 ***
## racehispanic -19553.25 2564.93 -7.623 2.84e-14 ***
## jobsnum_2012 -667.93 93.69 -7.129 1.12e-12 ***
## expect_educationmiddle_high -26221.48 1319.07 -19.879 < 2e-16 ***
## expect_educationprimary -39710.29 10249.31 -3.874 0.000108 ***
## marriage_col_2000other -9924.22 1450.25 -6.843 8.48e-12 ***
## familysize_2012 1556.81 487.30 3.195 0.001406 **
## genderfemale:raceblack 25184.59 2982.88 8.443 < 2e-16 ***
## genderfemale:racehispanic 17213.80 3523.87 4.885 1.06e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51630 on 6331 degrees of freedom
## Multiple R-squared: 0.1688, Adjusted R-squared: 0.1675
## F-statistic: 128.6 on 10 and 6331 DF, p-value: < 2.2e-16
kable(coef(summary(lm.4)), digits = c(0, 0, 2, 4))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 84405 | 2341 | 36.06 | 0.0000 |
| genderfemale | -27093 | 1313 | -20.63 | 0.0000 |
| raceblack | -18556 | 1562 | -11.88 | 0.0000 |
| racehispanic | -10713 | 1795 | -5.97 | 0.0000 |
| jobsnum_2012 | -724 | 94 | -7.70 | 0.0000 |
| expect_educationmiddle_high | -26362 | 1327 | -19.87 | 0.0000 |
| expect_educationprimary | -38792 | 10304 | -3.76 | 0.0002 |
| marriage_col_2000other | -9353 | 1457 | -6.42 | 0.0000 |
| familysize_2012 | 1782 | 489 | 3.64 | 0.0003 |
kable(coef(summary(lm.4.interact)), digits = c(0, 0, 2, 4))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 90031 | 2417 | 37.25 | 0.0000 |
| genderfemale | -37889 | 1821 | -20.80 | 0.0000 |
| raceblack | -31554 | 2187 | -14.43 | 0.0000 |
| racehispanic | -19553 | 2565 | -7.62 | 0.0000 |
| jobsnum_2012 | -668 | 94 | -7.13 | 0.0000 |
| expect_educationmiddle_high | -26221 | 1319 | -19.88 | 0.0000 |
| expect_educationprimary | -39710 | 10249 | -3.87 | 0.0001 |
| marriage_col_2000other | -9924 | 1450 | -6.84 | 0.0000 |
| familysize_2012 | 1557 | 487 | 3.19 | 0.0014 |
| genderfemale:raceblack | 25185 | 2983 | 8.44 | 0.0000 |
| genderfemale:racehispanic | 17214 | 3524 | 4.88 | 0.0000 |
Discussion: According to the tables, when we interact gender with race, the coefficient genderfemale:raceblack (25185) means that the male-female income gap among black respondents is about $25185 smaller than the gap among the baseline ("other") race group; similarly, genderfemale:racehispanic (17214) means the gap among hispanic respondents is about $17214 smaller than the baseline gap. Race is therefore strongly associated with the size of the income gap between men and women, and the main-effect coefficients change substantially once the interaction is added, which further indicates that race is an important factor.
# Use ANOVA to further discuss whether there is statistically significance in income gaps between different races.
anova(lm.4, lm.4.interact)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012 + gender:race
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6333 17077456571730
## 2 6331 16873273379046 2 204183192684 38.306 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Male-minus-female mean income gap within each expected-education group
new.nlsy %>%
group_by(expect_education) %>%
summarize(income.gap = mean(income[gender == "male"]) -
mean(income[gender == "female"]))
## # A tibble: 3 x 2
## expect_education income.gap
## <fct> <dbl>
## 1 college 35791.
## 2 middle_high 15193.
## 3 primary 16898.
Discussion: This is the income gap between the different expected-education levels. Surprisingly, the income gap within the primary-school group (16898) is larger than within the middle/high-school group (15193), which means that a higher expected level of education does not always lead to a larger income gap.
# Bar chart of the gender income gap (male minus female mean income) within
# each expected-education group, ordered from largest to smallest gap.
gap <- new.nlsy %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"])) %>%
  mutate(expect_education = reorder(expect_education, -income.gap))
ggplot(data = gap, aes(x = expect_education, y = income.gap, fill = expect_education)) +
  geom_col() +  # geom_col() is geom_bar(stat = "identity")
  xlab("expected education") +
  ylab("Income gap by education") +
  ggtitle("Income gap between male and female, by expected education") +
  guides(fill = FALSE)
# Calculate income gaps (male - female) and 95% confidence intervals.
# t.test(income ~ gender) compares the factor levels in order (male, female),
# so the estimated difference and its CI are male minus female.
# BUG FIX: conf.int[1] is the LOWER bound and conf.int[2] the UPPER bound;
# the original code assigned them the other way round, so the error bars
# were drawn from inverted bounds.
gap.conf <- new.nlsy %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]),
            lower = t.test(income ~ gender)$conf.int[1],
            upper = t.test(income ~ gender)$conf.int[2],
            is.significant = as.numeric(t.test(income ~ gender)$p.value < 0.05))
# reorder the expected education factor according to gap size
gap.conf <- mutate(gap.conf,
                   expect_education = reorder(expect_education, income.gap))
# error bar plots
ggplot(data = gap.conf, aes(x = expect_education, y = income.gap,
                            fill = is.significant)) +
  geom_bar(stat = "identity") +
  xlab("expected education") +
  ylab("income gap by expected education") +
  ggtitle("Income gap between male and female by expected education") +
  guides(fill = FALSE) +
  geom_errorbar(aes(ymax = upper, ymin = lower), width = 0.1, size = 1) +
  theme(text = element_text(size=12))
# The outcome variable of income in 2012 is topcoded, and we can see from the analysis above that the extreme high income values somewhat affect the results. Therefore, I choose to remove observations that earn highest income to run regression.
# Find the exact value of extreme income values
max(new.nlsy$income)
## [1] 343830
# Create a new dataframe that drops the topcoded (maximum) income value.
# Computing max() avoids hard-coding the magic number 343830; the result is
# identical here since the observed maximum equals the topcode value.
new.nlsy.2 <- subset(new.nlsy, income != max(new.nlsy$income))
new.nlsy.2
## # A tibble: 6,207 x 11
## versionid caseid birth_country expect_education race gender income
## <dbl> <dbl> <fct> <fct> <fct> <fct> <dbl>
## 1 445 2 Other middle_high other female 19000
## 2 445 3 US college other female 35000
## 3 445 6 US college other male 105000
## 4 445 8 US college other female 40000
## 5 445 9 US college other male 75000
## 6 445 14 US college other female 102000
## 7 445 15 US college other male 0
## 8 445 16 US college other female 70000
## 9 445 17 US college other male 60000
## 10 445 18 US college other male 150000
## # ... with 6,197 more rows, and 4 more variables: num_drug_1984 <fct>,
## # marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
# Recode the numeric survey codes in the subset into labelled factors, using
# the same mappings as for new.nlsy, in a single mutate() call (the columns
# are independent, so merging the three original mutate() steps is equivalent)
new.nlsy.2 <- new.nlsy.2 %>%
  mutate(race = recode_factor(race,
                              `3` = "other",
                              `2` = "black",
                              `1` = "hispanic"),
         gender = recode_factor(gender,
                                `1` = "male",
                                `2` = "female"),
         num_drug_1984 = recode_factor(num_drug_1984,
                                       `0` = "never",
                                       `1` = "1-9",
                                       `2` = "10-39",
                                       `3` = "40-99",
                                       `4` = "100-999",
                                       `5` = "1000+"),
         marriage_col_2000 = recode_factor(marriage_col_2000,
                                           `2` = "married",
                                           `1` = "other",
                                           `3` = "other"),
         birth_country = recode_factor(birth_country,
                                       `1` = "US",
                                       `2` = "Other"),
         expect_education = recode_factor(expect_education,
                                          `13` = "college",
                                          `14` = "college",
                                          `15` = "college",
                                          `16` = "college",
                                          `17` = "college",
                                          `18` = "college",
                                          `7` = "middle_high",
                                          `8` = "middle_high",
                                          `9` = "middle_high",
                                          `10` = "middle_high",
                                          `11` = "middle_high",
                                          `12` = "middle_high",
                                          `1` = "primary",
                                          `2` = "primary",
                                          `3` = "primary",
                                          `4` = "primary",
                                          `5` = "primary",
                                          `6` = "primary"))
new.nlsy.2
## # A tibble: 6,207 x 11
## versionid caseid birth_country expect_education race gender income
## <dbl> <dbl> <fct> <fct> <fct> <fct> <dbl>
## 1 445 2 Other middle_high other female 19000
## 2 445 3 US college other female 35000
## 3 445 6 US college other male 105000
## 4 445 8 US college other female 40000
## 5 445 9 US college other male 75000
## 6 445 14 US college other female 102000
## 7 445 15 US college other male 0
## 8 445 16 US college other female 70000
## 9 445 17 US college other male 60000
## 10 445 18 US college other male 150000
## # ... with 6,197 more rows, and 4 more variables: num_drug_1984 <fct>,
## # marriage_col_2000 <fct>, familysize_2012 <dbl>, jobsnum_2012 <dbl>
# Mean income broken down by gender and race (rounded to whole dollars),
# then reshaped so genders become columns and rendered as a markdown table
table.mean.income.a <- new.nlsy.2 %>%
  group_by(gender, race) %>%
  summarize(mean.income.a = round(mean(income), 0))
table.mean.income.a %>%
  spread(gender, mean.income.a) %>%
  kable(format = "markdown")
| race | male | female |
|---|---|---|
| other | 52511 | 32076 |
| black | 30317 | 24108 |
| hispanic | 39693 | 28021 |
Discussion: When we remove the topcoded income, the distribution of income among male and female is more balanced.
# Some graphics showing the relationship between gender and income
ggplot(data=new.nlsy.2, aes(x=race, y=income, colour = gender)) + geom_boxplot()
Discussion: After removing the topcoded observations, this boxplot shows people's income divided by gender and race.
# Find whether there is any correlations between familysize and income
cor(new.nlsy.2$familysize_2012,new.nlsy.2$income)
## [1] 0.06185022
# Does the correlation vary by gender?
new.nlsy.2 %>%
group_by(gender) %>%
summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 2 x 2
## gender cor_income_family
## <fct> <dbl>
## 1 male 0.177
## 2 female -0.0358
# Does the correlation vary by education and num_drug_1984?
new.nlsy.2 %>%
group_by(expect_education, num_drug_1984) %>%
summarize(cor_income_family = cor(income, familysize_2012))
## # A tibble: 14 x 3
## # Groups: expect_education [3]
## expect_education num_drug_1984 cor_income_family
## <fct> <fct> <dbl>
## 1 college never 0.0555
## 2 college 1-9 0.0946
## 3 college 10-39 0.0395
## 4 college 40-99 0.118
## 5 college 100-999 0.0505
## 6 college 1000+ -0.0231
## 7 middle_high never 0.000914
## 8 middle_high 1-9 0.0562
## 9 middle_high 10-39 0.131
## 10 middle_high 40-99 0.0831
## 11 middle_high 100-999 0.0147
## 12 middle_high 1000+ 0.186
## 13 primary never -0.359
## 14 primary 1-9 -1
Discussion: I ran some correlations between income, family size and the other factor variables. Broken down by gender, the correlation between income and family size among women is -0.036, which is surprising. Also, when broken down by expected education and the number of times drugs were used in 1984, the college / 1000+ group (-0.023) and the primary / never group (-0.359) show negative correlations as well (the primary / 1-9 value of -1 is an artifact of a tiny group size).
# Testing differences in income bewteen male and female
qplot(x = gender, y = income,
geom = "boxplot", data = new.nlsy.2,
xlab = "gender",
ylab = "income in 2012",
fill = I("lightblue"))
Discussion: First I use a boxplot to see the distribution of income broken down by gender. Generally, men's income is higher than women's: the median income is around 47500 for men and around 25000 for women.
# Find the mean, standard deviation and standard error of income by gender
# to see whether the income difference between genders looks meaningful
# (standard error = sd / sqrt(n))
new.nlsy.2 %>%
group_by(gender) %>%
summarize(num.obs = n(),
mean.income = round(mean(income), 0),
sd.income = round(sd(income), 0),
se.income = round(sd(income) / sqrt(num.obs), 0))
## # A tibble: 2 x 5
## gender num.obs mean.income sd.income se.income
## <fct> <int> <dbl> <dbl> <dbl>
## 1 male 2901 43154 38696 718
## 2 female 3306 28857 30550 531
Discussion: We can see that after removing the topcoded income values, the difference in average income between men and women is smaller.
# Run a two-sample t-test
income.t.test.a <- t.test(income ~ gender, data = new.nlsy.2)
income.t.test.a
##
## Welch Two Sample t-test
##
## data: income by gender
## t = 16, df = 5496.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 12545.62 16049.11
## sample estimates:
## mean in group male mean in group female
## 43154.29 28856.92
Discussion: The p-value is smaller than 2.2e-16, so at the 0.05 significance level we reject the null hypothesis: the difference in mean income between men and women is statistically significant.
# p value
income.t.test.a$p.value
## [1] 2.343502e-56
Discussion: The t-test p-value is extremely small (about 2.3e-56).
# group means in male and female
income.t.test.a$estimate
## mean in group male mean in group female
## 43154.29 28856.92
Discussion: There is a significant difference in average income between men and women (43154.29 vs 28856.92); the estimated gap is about $14297 in favor of men.
# confidence interval for the difference
income.t.test.a$conf.int
## [1] 12545.62 16049.11
## attr(,"conf.level")
## [1] 0.95
Discussion: The confidence interval is [12545.62,16049.11]
# Also, try to run a wilcox test
income.wilcox.test.a <- wilcox.test(income ~ gender, data=new.nlsy.2, conf.int=TRUE)
income.wilcox.test.a
##
## Wilcoxon rank sum test with continuity correction
##
## data: income by gender
## W = 5825100, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## 9000 13000
## sample estimates:
## difference in location
## 11000
Discussion: When running a Wilcoxon test, the p-value is also smaller than 2.2e-16, which is statistically significant.
# exlpore whether data is normal
ggplot(data = new.nlsy.2, aes(sample = income)) + stat_qq() + stat_qq_line() + facet_grid(. ~ gender)
Discussion: After removing the topcoded values, the data seems more normal than before.
# Use ANOVA to test whether there is a significant association between gender and income
summary(aov(income ~ gender, data = new.nlsy.2))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 315849828529 315849828529 263.9 <2e-16 ***
## Residuals 6205 7426944326143 1196928981
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, race, expect_education and income
summary(aov(income ~ gender + race + expect_education, data = new.nlsy.2))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 315849828529 315849828529 294.1 <2e-16 ***
## race 2 261357016364 130678508182 121.7 <2e-16 ***
## expect_education 2 507077934072 253538967036 236.1 <2e-16 ***
## Residuals 6201 6658509375707 1073779935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Use ANOVA to test whether there is a significant association between gender, num_drug_1984, marriage_col_2000 and income
summary(aov(income ~ gender + num_drug_1984 + marriage_col_2000, data = new.nlsy.2))
## Df Sum Sq Mean Sq F value Pr(>F)
## gender 1 315849828529 315849828529 276.45 < 2e-16 ***
## num_drug_1984 5 65880909821 13176181964 11.53 4.16e-11 ***
## marriage_col_2000 1 278647001869 278647001869 243.89 < 2e-16 ***
## Residuals 6199 7082416414452 1142509504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Discussion: We can see from the three outputs that all p-values are smaller than 2e-16, which is significant at the 0.05 level, so there are associations between income and gender, race, num_drug_1984, marriage_col_2000, and so on.
# regress outcome variable income against all other variables
lm.a <- lm(income ~ ., data = new.nlsy.2)
lm.a
##
## Call:
## lm(formula = income ~ ., data = new.nlsy.2)
##
## Coefficients:
## (Intercept) versionid
## 65090.9297 NA
## caseid birth_countryOther
## -0.2733 4558.5034
## expect_educationmiddle_high expect_educationprimary
## -17465.1710 -31992.6576
## raceblack racehispanic
## -10550.2933 -5644.5820
## genderfemale num_drug_19841-9
## -15990.4213 1845.3201
## num_drug_198410-39 num_drug_198440-99
## 5904.4073 2605.6223
## num_drug_1984100-999 num_drug_19841000+
## 1626.4507 -3216.5358
## marriage_col_2000other familysize_2012
## -7909.7019 398.0047
## jobsnum_2012
## -496.7083
summary(lm.a)
##
## Call:
## lm(formula = income ~ ., data = new.nlsy.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70273 -22912 -4687 17110 142349
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65090.9297 1659.8045 39.216 < 2e-16
## versionid NA NA NA NA
## caseid -0.2733 0.1457 -1.875 0.0608
## birth_countryOther 4558.5034 1840.5488 2.477 0.0133
## expect_educationmiddle_high -17465.1710 832.7235 -20.974 < 2e-16
## expect_educationprimary -31992.6576 6484.5409 -4.934 0.000000828
## raceblack -10550.2933 1096.8558 -9.619 < 2e-16
## racehispanic -5644.5820 1305.3690 -4.324 0.000015554
## genderfemale -15990.4213 841.5838 -19.000 < 2e-16
## num_drug_19841-9 1845.3201 1042.1714 1.771 0.0767
## num_drug_198410-39 5904.4073 1412.4467 4.180 0.000029518
## num_drug_198440-99 2605.6223 1643.6832 1.585 0.1130
## num_drug_1984100-999 1626.4507 1553.5156 1.047 0.2952
## num_drug_19841000+ -3216.5358 1748.9757 -1.839 0.0659
## marriage_col_2000other -7909.7019 913.0887 -8.663 < 2e-16
## familysize_2012 398.0047 307.1770 1.296 0.1951
## jobsnum_2012 -496.7083 60.0314 -8.274 < 2e-16
##
## (Intercept) ***
## versionid
## caseid .
## birth_countryOther *
## expect_educationmiddle_high ***
## expect_educationprimary ***
## raceblack ***
## racehispanic ***
## genderfemale ***
## num_drug_19841-9 .
## num_drug_198410-39 ***
## num_drug_198440-99
## num_drug_1984100-999
## num_drug_19841000+ .
## marriage_col_2000other ***
## familysize_2012
## jobsnum_2012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32210 on 6191 degrees of freedom
## Multiple R-squared: 0.1705, Adjusted R-squared: 0.1685
## F-statistic: 84.83 on 15 and 6191 DF, p-value: < 2.2e-16
# NOTE(review): the model fit immediately above is lm.a, but the original call
# tabulated lm.1 (copy-paste error). The stale rendered table below will
# refresh to the lm.a coefficients on the next knit.
kable(summary(lm.a)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 83904.539 | 2634.491 | 31.848 | 0.0000 |
| caseid | -0.324 | 0.233 | -1.394 | 0.1632 |
| birth_countryOther | 4245.462 | 2930.504 | 1.449 | 0.1475 |
| expect_educationmiddle_high | -25934.248 | 1327.800 | -19.532 | 0.0000 |
| expect_educationprimary | -39654.483 | 10431.500 | -3.801 | 0.0001 |
| raceblack | -17276.911 | 1751.408 | -9.865 | 0.0000 |
| racehispanic | -10195.590 | 2080.578 | -4.900 | 0.0000 |
| genderfemale | -27623.306 | 1335.661 | -20.681 | 0.0000 |
| num_drug_19841-9 | 2201.898 | 1659.169 | 1.327 | 0.1845 |
| num_drug_198410-39 | 7427.039 | 2241.521 | 3.313 | 0.0009 |
| num_drug_198440-99 | 2789.252 | 2608.926 | 1.069 | 0.2851 |
| num_drug_1984100-999 | 629.600 | 2469.092 | 0.255 | 0.7987 |
| num_drug_19841000+ | -8180.144 | 2799.728 | -2.922 | 0.0035 |
| marriage_col_2000other | -9017.580 | 1459.458 | -6.179 | 0.0000 |
| familysize_2012 | 1773.393 | 489.099 | 3.626 | 0.0003 |
| jobsnum_2012 | -698.401 | 96.015 | -7.274 | 0.0000 |
Discussion: Now we regress income on all the other variables (after removing topcoded incomes). Most p-values are very small (notably for race and expected education), suggesting those are significant predictors of income. The coefficients for female gender, black/hispanic race, lower expected education, non-married status and number of jobs are negative, so those characteristics are associated with lower income; being born outside the US and moderate drug use have positive coefficients. Note that versionid has an undefined (NA) coefficient because it is constant. The adjusted R-squared is 0.1685, meaning about 17% of the variance of income is explained by the model.
# regress outcome variable income against gender, race, expect_education, marriage and familsize
lm.b <- lm(income ~ gender + race + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy.2)
lm.b
##
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 +
## familysize_2012, data = new.nlsy.2)
##
## Coefficients:
## (Intercept) genderfemale
## 58657.3 -15172.3
## raceblack racehispanic
## -11199.0 -5712.6
## expect_educationmiddle_high expect_educationprimary
## -17112.1 -27968.6
## marriage_col_2000other familysize_2012
## -8801.5 557.1
summary(lm.b)
##
## Call:
## lm(formula = income ~ gender + race + expect_education + marriage_col_2000 +
## familysize_2012, data = new.nlsy.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63114 -22889 -5175 16785 140401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58657.3 1225.3 47.870 < 2e-16 ***
## genderfemale -15172.3 828.5 -18.313 < 2e-16 ***
## raceblack -11199.0 982.1 -11.403 < 2e-16 ***
## racehispanic -5712.6 1133.4 -5.040 0.000000478 ***
## expect_educationmiddle_high -17112.1 834.3 -20.511 < 2e-16 ***
## expect_educationprimary -27968.6 6435.3 -4.346 0.000014078 ***
## marriage_col_2000other -8801.5 912.7 -9.643 < 2e-16 ***
## familysize_2012 557.1 308.7 1.805 0.0711 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32470 on 6199 degrees of freedom
## Multiple R-squared: 0.156, Adjusted R-squared: 0.1551
## F-statistic: 163.7 on 7 and 6199 DF, p-value: < 2.2e-16
kable(summary(lm.b)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 58657.316 | 1225.345 | 47.870 | 0.0000 |
| genderfemale | -15172.331 | 828.502 | -18.313 | 0.0000 |
| raceblack | -11198.980 | 982.091 | -11.403 | 0.0000 |
| racehispanic | -5712.615 | 1133.412 | -5.040 | 0.0000 |
| expect_educationmiddle_high | -17112.123 | 834.304 | -20.511 | 0.0000 |
| expect_educationprimary | -27968.635 | 6435.299 | -4.346 | 0.0000 |
| marriage_col_2000other | -8801.471 | 912.699 | -9.643 | 0.0000 |
| familysize_2012 | 557.128 | 308.684 | 1.805 | 0.0711 |
Discussion: Now we regress income on gender, race, expected education, marriage status and family size in the data without topcoded incomes. Every predictor except family size is highly significant, and all of their coefficients are negative, so those characteristics are associated with lower income relative to the baseline; family size has a small positive coefficient that is only marginally significant (p = 0.0711). The adjusted R-squared is 0.1551, meaning about 15.5% of the variance of income is explained by the model.
# Plot the linear regression model
plot(lm.b)
# Collinearity and pairs plots
income.var.names.b <- c("gender", "race", "expect_education", "marriage_col_2000", "familysize_2012")
pairs(new.nlsy.2[,income.var.names.b])
Discussion: Now we run a regression of income on gender, race, expected education, marriage status and family size (after removing topcoded incomes). From the residuals-vs-fitted plot we can see that, although there are some outliers, the residuals still show somewhat fan-shaped (non-constant) variance. The QQ-plot shows the normality assumption does not hold exactly, though the fit looks better than before. The scale-location plot is another view of the residuals-vs-fitted relationship, and the residuals-vs-leverage plot shows a few high-residual, high-leverage points (outliers) pulling the fit away from the rest of the data.
From the pairs plot, we can see that in this model the collinearity between the variables is very weak.
# regress outcome variable income against gender, race, expect_education, drug and familsize
lm.c <- lm(income ~ gender + race + expect_education + num_drug_1984 + familysize_2012, data = new.nlsy.2)
lm.c
##
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 +
## familysize_2012, data = new.nlsy.2)
##
## Coefficients:
## (Intercept) genderfemale
## 54127.6 -15973.3
## raceblack racehispanic
## -13795.0 -6964.7
## expect_educationmiddle_high expect_educationprimary
## -17600.6 -28190.3
## num_drug_19841-9 num_drug_198410-39
## 686.4 4536.8
## num_drug_198440-99 num_drug_1984100-999
## 639.6 -1162.4
## num_drug_19841000+ familysize_2012
## -7288.1 1407.7
summary(lm.c)
##
## Call:
## lm(formula = income ~ gender + race + expect_education + num_drug_1984 +
## familysize_2012, data = new.nlsy.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67110 -24009 -4777 17121 141538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54127.6 1327.8 40.764 < 2e-16 ***
## genderfemale -15973.3 849.4 -18.805 < 2e-16 ***
## raceblack -13795.0 951.7 -14.495 < 2e-16 ***
## racehispanic -6964.7 1135.5 -6.133 9.13e-10 ***
## expect_educationmiddle_high -17600.6 836.8 -21.033 < 2e-16 ***
## expect_educationprimary -28190.3 6479.9 -4.350 1.38e-05 ***
## num_drug_19841-9 686.4 1048.3 0.655 0.51266
## num_drug_198410-39 4536.8 1421.4 3.192 0.00142 **
## num_drug_198440-99 639.6 1652.6 0.387 0.69873
## num_drug_1984100-999 -1162.4 1554.5 -0.748 0.45461
## num_drug_19841000+ -7288.1 1735.8 -4.199 2.72e-05 ***
## familysize_2012 1407.7 296.5 4.747 2.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32630 on 6195 degrees of freedom
## Multiple R-squared: 0.1483, Adjusted R-squared: 0.1468
## F-statistic: 98.06 on 11 and 6195 DF, p-value: < 2.2e-16
# Render the coefficient table of lm.c as a markdown table
kable(summary(lm.c)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 54127.602 | 1327.837 | 40.764 | 0.0000 |
| genderfemale | -15973.331 | 849.411 | -18.805 | 0.0000 |
| raceblack | -13794.998 | 951.708 | -14.495 | 0.0000 |
| racehispanic | -6964.739 | 1135.533 | -6.133 | 0.0000 |
| expect_educationmiddle_high | -17600.633 | 836.821 | -21.033 | 0.0000 |
| expect_educationprimary | -28190.254 | 6479.872 | -4.350 | 0.0000 |
| num_drug_19841-9 | 686.359 | 1048.294 | 0.655 | 0.5127 |
| num_drug_198410-39 | 4536.848 | 1421.415 | 3.192 | 0.0014 |
| num_drug_198440-99 | 639.641 | 1652.618 | 0.387 | 0.6987 |
| num_drug_1984100-999 | -1162.450 | 1554.484 | -0.748 | 0.4546 |
| num_drug_19841000+ | -7288.087 | 1735.791 | -4.199 | 0.0000 |
| familysize_2012 | 1407.660 | 296.538 | 4.747 | 0.0000 |
Discussion: Now we run a regression model, regressing income on gender, race, expect_education, num_drug_1984 and familysize. Most p-values are very small (gender, race, expected education), which means these are statistically significant predictors of income. But the p-values for most of the drug-use categories are high, suggesting drug use may not be a significant predictor. Note that most coefficients are negative: being female, Black or Hispanic, and having lower expected education are all associated with lower income, while family size has a positive association. The adjusted R-squared is 0.1468, which means that about 15% of the variance of income is explained by these predictors.
# Regress income on gender, race, number of jobs in 2012 (jobsnum_2012),
# expect_education, marital status (marriage_col_2000) and family size
lm.d <- lm(income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 + familysize_2012, data = new.nlsy.2)
# Print the fitted coefficients
lm.d
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012, data = new.nlsy.2)
##
## Coefficients:
## (Intercept) genderfemale
## 65702.4 -15813.7
## raceblack racehispanic
## -11722.2 -5979.2
## jobsnum_2012 expect_educationmiddle_high
## -497.4 -17715.8
## expect_educationprimary marriage_col_2000other
## -30758.6 -8043.8
## familysize_2012
## 394.5
# Full summary of the second model
summary(lm.d)
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012, data = new.nlsy.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67366 -22943 -4603 16960 142299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65702.4 1475.9 44.518 < 2e-16 ***
## genderfemale -15813.7 827.3 -19.115 < 2e-16 ***
## raceblack -11722.2 978.5 -11.980 < 2e-16 ***
## racehispanic -5979.2 1127.5 -5.303 0.000000118 ***
## jobsnum_2012 -497.4 58.8 -8.459 < 2e-16 ***
## expect_educationmiddle_high -17715.8 832.7 -21.276 < 2e-16 ***
## expect_educationprimary -30758.6 6407.5 -4.800 0.000001620 ***
## marriage_col_2000other -8043.8 912.0 -8.820 < 2e-16 ***
## familysize_2012 394.5 307.5 1.283 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32280 on 6198 degrees of freedom
## Multiple R-squared: 0.1657, Adjusted R-squared: 0.1646
## F-statistic: 153.8 on 8 and 6198 DF, p-value: < 2.2e-16
# Render the coefficient table of lm.d as a markdown table
kable(summary(lm.d)$coef, digits = c(3, 3, 3, 4), format = 'markdown')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 65702.419 | 1475.878 | 44.518 | 0.0000 |
| genderfemale | -15813.737 | 827.309 | -19.115 | 0.0000 |
| raceblack | -11722.201 | 978.506 | -11.980 | 0.0000 |
| racehispanic | -5979.182 | 1127.457 | -5.303 | 0.0000 |
| jobsnum_2012 | -497.428 | 58.804 | -8.459 | 0.0000 |
| expect_educationmiddle_high | -17715.781 | 832.660 | -21.276 | 0.0000 |
| expect_educationprimary | -30758.615 | 6407.481 | -4.800 | 0.0000 |
| marriage_col_2000other | -8043.809 | 911.958 | -8.820 | 0.0000 |
| familysize_2012 | 394.504 | 307.543 | 1.283 | 0.1996 |
Discussion: Now we run a regression model, regressing income on gender, race, expect_education, jobsnum, marital status and familysize. All predictors except family size (p = 0.1996) have very small p-values and are statistically significant. The coefficients for female gender, Black and Hispanic race, number of jobs, lower expected education and non-married status are all negative, indicating lower income; family size has a small positive but non-significant coefficient. The adjusted R-squared is 0.1646, which means that about 16% of the variance of income is explained by these predictors.
Interpreting the coefficient of genderfemale: on average, holding the other variables constant, the income of women is $15,813.7 less than that of men.
# Calculate race-specific intercepts: the baseline ("other") intercept
# plus the raceblack / racehispanic offsets from lm.d
intercepts <- c(coef(lm.d)["(Intercept)"],
coef(lm.d)["(Intercept)"] + coef(lm.d)["raceblack"],
coef(lm.d)["(Intercept)"] + coef(lm.d)["racehispanic"])
# One fitted line per race: a common slope (jobsnum_2012) but different intercepts
lines.df.d <- data.frame(intercepts = intercepts,
slopes = rep(coef(lm.d)["jobsnum_2012"], 3),
race = levels(new.nlsy.2$race))
# Scatter income vs number of jobs, overlaying the race-specific fitted lines
qplot(x = jobsnum_2012, y = income, color = race, data = new.nlsy.2) +
geom_abline(aes(intercept = intercepts,
slope = slopes,
color = race), data = lines.df.d)
Interpreting the race coefficients: the baseline race category is "other". Among people with the same number of jobs in 2012, both Black (-$11,722) and Hispanic (-$5,979) respondents earn less than the baseline group on average, with Hispanic respondents earning more on average than Black respondents.
# Diagnostic plots for lm.d: residuals vs fitted, normal QQ,
# scale-location, and residuals vs leverage
plot(lm.d)
# Collinearity check: pairs plot of the predictors used in lm.d
income.var.names.d <- c("gender", "race", "jobsnum_2012", "expect_education", "familysize_2012","marriage_col_2000")
pairs(new.nlsy.2[,income.var.names.d])
Discussion: For this model (income on gender, race, number of jobs, expected education, marital status and family size), the residuals-vs-fitted plot again shows some outliers and a somewhat fan-shaped spread of residuals, so the variance is not constant. The QQ plot shows the normality assumption does not hold exactly; the model is not ideally fitted but appears acceptable for prediction. The scale-location plot is another view of the residuals-vs-fitted plot. The residuals-vs-leverage plot shows that points with high residuals and high leverage (outliers) skew the model fit away from the rest of the data.
From the pairs plot, although the collinearity among the predictor variables is still weak, it is somewhat stronger than in the previous model; there is some collinearity among jobsnum_2012, expected education and family size.
# Test whether race, marriage and family size are significant predictors of
# income differences: F-test comparing lm.d with and without the race term
anova(update(lm.d, . ~ . - race), lm.d)
## Analysis of Variance Table
##
## Model 1: income ~ gender + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6200 6611478109255
## 2 6198 6459998462703 2 151479646553 72.668 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same F-test for marital status
anova(update(lm.d, . ~ . - marriage_col_2000), lm.d)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6199 6541086061501
## 2 6198 6459998462703 1 81087598798 77.799 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same F-test for family size
anova(update(lm.d, . ~ . - familysize_2012), lm.d)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6199 6461713486522
## 2 6198 6459998462703 1 1715023819 1.6455 0.1996
Discussion: The ANOVA tests for race and marital status have p-values far below 0.05, so both are statistically significant predictors. However, the test for familysize_2012 has p = 0.1996 > 0.05, so family size is not a statistically significant predictor in this model.
# Add a gender x race interaction to lm.d to let the gender gap vary by race
lm.d.interact <- update(lm.d, . ~ . + gender*race)
# Full summary of the interaction model
summary(lm.d.interact)
##
## Call:
## lm(formula = income ~ gender + race + jobsnum_2012 + expect_education +
## marriage_col_2000 + familysize_2012 + gender:race, data = new.nlsy.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69751 -22376 -4625 16737 144049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68865.25 1530.20 45.004 < 2e-16 ***
## genderfemale -21704.88 1155.87 -18.778 < 2e-16 ***
## raceblack -18980.35 1381.66 -13.737 < 2e-16 ***
## racehispanic -10545.79 1627.03 -6.482 9.77e-11 ***
## jobsnum_2012 -468.79 58.68 -7.990 1.60e-15 ***
## expect_educationmiddle_high -17687.62 829.01 -21.336 < 2e-16 ***
## expect_educationprimary -31196.14 6382.10 -4.888 1.04e-06 ***
## marriage_col_2000other -8369.27 909.02 -9.207 < 2e-16 ***
## familysize_2012 286.52 306.58 0.935 0.35
## genderfemale:raceblack 13868.66 1872.88 7.405 1.49e-13 ***
## genderfemale:racehispanic 8714.34 2217.69 3.929 8.61e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32140 on 6196 degrees of freedom
## Multiple R-squared: 0.1733, Adjusted R-squared: 0.172
## F-statistic: 129.9 on 10 and 6196 DF, p-value: < 2.2e-16
# Coefficient table of the main-effects model, for side-by-side comparison
kable(coef(summary(lm.d)), digits = c(0, 0, 2, 4))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 65702 | 1476 | 44.52 | 0.0000 |
| genderfemale | -15814 | 827 | -19.11 | 0.0000 |
| raceblack | -11722 | 979 | -11.98 | 0.0000 |
| racehispanic | -5979 | 1127 | -5.30 | 0.0000 |
| jobsnum_2012 | -497 | 59 | -8.46 | 0.0000 |
| expect_educationmiddle_high | -17716 | 833 | -21.28 | 0.0000 |
| expect_educationprimary | -30759 | 6407 | -4.80 | 0.0000 |
| marriage_col_2000other | -8044 | 912 | -8.82 | 0.0000 |
| familysize_2012 | 395 | 308 | 1.28 | 0.1996 |
# Coefficient table of the interaction model
kable(coef(summary(lm.d.interact)), digits = c(0, 0, 2, 4))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 68865 | 1530 | 45.00 | 0.0000 |
| genderfemale | -21705 | 1156 | -18.78 | 0.0000 |
| raceblack | -18980 | 1382 | -13.74 | 0.0000 |
| racehispanic | -10546 | 1627 | -6.48 | 0.0000 |
| jobsnum_2012 | -469 | 59 | -7.99 | 0.0000 |
| expect_educationmiddle_high | -17688 | 829 | -21.34 | 0.0000 |
| expect_educationprimary | -31196 | 6382 | -4.89 | 0.0000 |
| marriage_col_2000other | -8369 | 909 | -9.21 | 0.0000 |
| familysize_2012 | 287 | 307 | 0.93 | 0.3500 |
| genderfemale:raceblack | 13869 | 1873 | 7.41 | 0.0000 |
| genderfemale:racehispanic | 8714 | 2218 | 3.93 | 0.0001 |
Discussion: According to the tables, when we interact gender and race, the genderfemale:raceblack coefficient means the male-female income gap among Black respondents is $13,868.66 smaller than the gap among the baseline ("other") race group; similarly, the genderfemale:racehispanic coefficient means the gap among Hispanic respondents is $8,714.34 smaller than the baseline gap. Race is therefore strongly associated with the size of the income gap between men and women, and the substantial changes in the other coefficients further indicate that race is an important factor.
# Use ANOVA to test whether the gender x race interaction significantly
# improves the model (i.e., whether the gender gap differs across races)
anova(lm.d, lm.d.interact)
## Analysis of Variance Table
##
## Model 1: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012
## Model 2: income ~ gender + race + jobsnum_2012 + expect_education + marriage_col_2000 +
## familysize_2012 + gender:race
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 6198 6459998462703
## 2 6196 6400667700183 2 59330762520 28.717 3.854e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean male-female income gap within each expected-education group
new.nlsy.2 %>%
group_by(expect_education) %>%
summarize(income.gap = mean(income[gender == "male"]) -
mean(income[gender == "female"]))
## # A tibble: 3 x 2
## expect_education income.gap
## <fct> <dbl>
## 1 college 17756.
## 2 middle_high 12161.
## 3 primary 16898.
Discussion: This is the income gap between men and women at each expected-education level. Surprisingly, the income gap within the primary-school group is larger than within the middle/high-school group, which suggests that a higher expected level of education does not directly lead to a larger gender income gap.
# Bar chart of the male-female income gap by expected-education group,
# to better show the table above
gap <- new.nlsy.2 %>%
group_by(expect_education) %>%
summarize(income.gap = mean(income[gender == "male"]) -
mean(income[gender == "female"]))
# Order the education levels by decreasing gap size
gap<- mutate(gap,
expect_education = reorder(expect_education, -income.gap))
ggplot(data = gap, aes(x = expect_education, y = income.gap, fill = expect_education)) +
geom_bar(stat = "identity") +
xlab("expected education") +
ylab("Income gap by education") +
ggtitle("Income gap between male and female, by expected education") +
guides(fill = FALSE)   # hide the redundant fill legend
# Calculate income gaps (male - female) and 95% confidence intervals
# for each expected-education group.
# Note: t.test()$conf.int is c(lower, upper), so conf.int[1] is the
# LOWER endpoint and conf.int[2] the UPPER endpoint; the original code
# had these two assignments swapped, which mislabeled the error bars.
# With gender levels c("male", "female"), the t-test difference is
# mean(male) - mean(female), matching income.gap, so the CI brackets it.
gap.conf <- new.nlsy.2 %>%
  group_by(expect_education) %>%
  summarize(income.gap = mean(income[gender == "male"]) -
              mean(income[gender == "female"]),
            lower = t.test(income ~ gender)$conf.int[1],
            upper = t.test(income ~ gender)$conf.int[2],
            # flag groups where the gap is significant at the 5% level
            is.significant = as.numeric(t.test(income ~ gender)$p.value < 0.05))
# Reorder the expected-education factor according to gap size
gap.conf <- mutate(gap.conf,
expect_education = reorder(expect_education, income.gap))
# Error-bar plot: bars show the income gap, whiskers the 95% CI, and the
# fill encodes whether the gap is statistically significant (t-test p < 0.05)
ggplot(data = gap.conf, aes(x = expect_education, y = income.gap,
fill = is.significant)) +
geom_bar(stat = "identity") +
xlab("expected education") +
ylab("income gap by expected education") +
ggtitle("Income gap between male and female by expected education") +
guides(fill = FALSE) +
geom_errorbar(aes(ymax = upper, ymin = lower), width = 0.1, size = 1) +
theme(text = element_text(size=12))
In this section you should provide an overview of the approach you took to exploring and analyzing the data. This is where you tell the story of how you got to your main findings. It’s too tedious to carefully format plots and tables for every approach you tried, so you can also use this section as a place to explain the various types of analyses that you tried.
In the raw data set, there are many negative values ranging from -5 to -1 (non-interview, valid skip, invalid skip, don't know and refusal), which may affect the accuracy of our analysis. Therefore, I first subset the data into a new data frame containing 10+ variables. Then I convert all the negative values to NA and remove rows containing NA, finally obtaining a smaller dataset with 5,826 observations. Removing the missing values allows a more objective data analysis.
First, I make some data summaries and regression analyses of the data with topcoded incomes, which shows that extremely high incomes (outliers) affect the estimated relationships between gender, education, race, etc. and income. Therefore, I use a box plot and the max() function to find the exact topcoded income value in 2012, which is 343,830. Then I subset the data into a new data frame that does not contain observations with an income of $343,830 per year. After removing the topcoded values, the income distribution and the income differences by gender and race are more interpretable. Outliers distort the regression coefficients to some extent, but in this case the regression results show no distinctive differences.
I expected that people who frequently use drugs would have lower income; however, the results show that US-born men with some drug experience actually have higher income.
I investigate the relationship between familysize and expect_education, the r squared is 0.0018.
The final analysis I settled on uses income as the outcome variable, with gender, race, expected education, family size in 2012, number of times drugs were used in 1984, and number of jobs held in 2012 as predictors. Expected education, race and family size are important factors affecting income by gender, and in general men earn more than women.
Potential limitations include: (1) Most predictors are categorical variables; only family size and number of jobs are numeric, and compared to the large numeric range of income this may limit the analysis. (2) The survey spans 1979 to 2012, and it is hard for such long-running repeated interviews to stay consistent, which may introduce duplication and inconsistency. Therefore, I should give more consideration to selecting factor variables that are more likely to have causal impacts on income in 2012 (e.g., selecting questions asked in 2012). (3) The classification of education levels (college, middle/high school, primary school) needs more discussion. I recoded the 18 education levels into three levels with college as the baseline; perhaps this recoding is unnecessary. (4) I have tried different factor variables, but the R-squared values are very similar, which suggests these factors are all comparably important.
Potential confounders: Maybe the number of children one has, one’s parents’ job type.
The models I fit are believable.
I am probably 90% confident in my analysis. I believe my conclusions, and I am confident enough to present them to policy makers.